Loading Libraries

require("readxl")
## Loading required package: readxl
## Warning: package 'readxl' was built under R version 3.5.3
library(readxl)

require("dplyr")
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dplyr)

require("tidyr")
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.5.3
library(tidyr)

require("rfm")
## Loading required package: rfm
## Warning: package 'rfm' was built under R version 3.5.3
library(rfm)

require("data.table")
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.5.3
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(data.table)

Importing data

cust_agg_tran <- read_excel("cust agg transaction data.xlsx")
head(cust_agg_tran)
## # A tibble: 6 x 9
##   customer_Number billCount item_Count MaxTransactionDate 
##             <dbl>     <dbl>      <dbl> <dttm>             
## 1               1         1          1 2017-02-11 00:00:00
## 2               2         1          4 2017-02-10 00:00:00
## 3               3         1          1 2017-02-14 00:00:00
## 4               4         1          2 2017-02-20 00:00:00
## 5               5         1          2 2017-02-28 00:00:00
## 6               6         1          1 2017-02-15 00:00:00
## # ... with 5 more variables: MinTransactionDate <dttm>, MaxAmount <dbl>,
## #   TotalAmount <dbl>, sale_Amount <dbl>, sale_Percent <dbl>
reference_file <- read_excel("Reference file.xlsx")
head(reference_file)
## # A tibble: 6 x 12
##   MemberCardNumber Customer_Group Revenue Invoice Nationality_Gro~
##              <dbl> <chr>            <dbl>   <dbl> <chr>           
## 1                1 2.NEWLY ACTIVE    4.25       1 REMAINING       
## 2                2 2.NEWLY ACTIVE   12.5        1 REMAINING       
## 3                3 2.NEWLY ACTIVE    4.1        1 REMAINING       
## 4                4 2.NEWLY ACTIVE    6.25       1 REMAINING       
## 5                5 2.NEWLY ACTIVE   32.4        1 REMAINING       
## 6                6 2.NEWLY ACTIVE    4.7        1 REMAINING       
## # ... with 7 more variables: NationalityClassification <chr>,
## #   Stateclassification <chr>, AgeClassification <chr>, RevenueBand <chr>,
## #   InvoiceBand <chr>, OUTLIEROVERALL <chr>, Sampling <chr>

Combining data of reference and cust_agg_tran in one

orignalData <- cbind(cust_agg_tran[1:49811,], reference_file)



orignalData<-
  orignalData %>% separate(Customer_Group, into=c("Customer_Group_Num", "Customer_Group"), sep= "\\.") %>% 
  separate(RevenueBand, into=c("RevenueBand_Num", "RevenueBand"), sep= "\\.")%>%
  separate(InvoiceBand, into=c("InvoiceBand_Num", "InvoiceBand"), sep= "\\.") %>%
  separate(Sampling, into=c("Sampling_Num", "Sampling"), sep= "\\.")

head(orignalData)
##   customer_Number billCount item_Count MaxTransactionDate
## 1               1         1          1         2017-02-11
## 2               2         1          4         2017-02-10
## 3               3         1          1         2017-02-14
## 4               4         1          2         2017-02-20
## 5               5         1          2         2017-02-28
## 6               6         1          1         2017-02-15
##   MinTransactionDate MaxAmount TotalAmount sale_Amount sale_Percent
## 1         2017-02-11        44          44           0            0
## 2         2017-02-10        45         155         155            1
## 3         2017-02-14        25          25          25            1
## 4         2017-02-20        25          45          45            1
## 5         2017-02-28        25          50          50            1
## 6         2017-02-15        35          35          35            1
##   MemberCardNumber Customer_Group_Num Customer_Group Revenue Invoice
## 1                1                  2   NEWLY ACTIVE    4.25       1
## 2                2                  2   NEWLY ACTIVE   12.50       1
## 3                3                  2   NEWLY ACTIVE    4.10       1
## 4                4                  2   NEWLY ACTIVE    6.25       1
## 5                5                  2   NEWLY ACTIVE   32.41       1
## 6                6                  2   NEWLY ACTIVE    4.70       1
##   Nationality_Group NationalityClassification Stateclassification
## 1         REMAINING                 REMAINING                NONE
## 2         REMAINING                 REMAINING                NONE
## 3         REMAINING                 REMAINING                NONE
## 4         REMAINING                 REMAINING                NONE
## 5         REMAINING                 REMAINING                NONE
## 6         REMAINING                 REMAINING                NONE
##   AgeClassification RevenueBand_Num RevenueBand InvoiceBand_Num
## 1         REMAINING               1    Below 25               1
## 2         REMAINING               3       50-75               1
## 3         REMAINING               1    Below 25               1
## 4         REMAINING               2       25-50               1
## 5         REMAINING               4    Above 75               1
## 6         REMAINING               1    Below 25               1
##   InvoiceBand OUTLIEROVERALL Sampling_Num Sampling
## 1    Below 25              N            1       TG
## 2    Below 25              N            1       TG
## 3    Below 25              N            1       TG
## 4    Below 25              N            1       TG
## 5    Below 25              N            1       TG
## 6    Below 25              N            1       TG

STEP 1

#From the Transaction Agg file, 
#select customers classified as "Active" 
#under the column Customer Group the reference file

active_Customers<- filter(orignalData, Customer_Group=="ACTIVE")
head(active_Customers)
##   customer_Number billCount item_Count MaxTransactionDate
## 1              22         1          2         2017-02-10
## 2              23         1          3         2017-02-08
## 3              25         1          2         2017-02-20
## 4              26         1          2         2017-02-17
## 5              27         1          1         2017-02-08
## 6              29         1          4         2017-02-10
##   MinTransactionDate MaxAmount TotalAmount sale_Amount sale_Percent
## 1         2017-02-10        30          60          60       1.0000
## 2         2017-02-08        40         119          80       0.6722
## 3         2017-02-20        65         114           0       0.0000
## 4         2017-02-17        25          30           5       0.1666
## 5         2017-02-08        35          35          35       1.0000
## 6         2017-02-10        40          85          85       1.0000
##   MemberCardNumber Customer_Group_Num Customer_Group Revenue Invoice
## 1               22                  1         ACTIVE   28.15       1
## 2               23                  1         ACTIVE   11.05       1
## 3               25                  1         ACTIVE    9.80       1
## 4               26                  1         ACTIVE   13.10       1
## 5               27                  1         ACTIVE   16.20       5
## 6               29                  1         ACTIVE   30.30       1
##      Nationality_Group NationalityClassification Stateclassification
## 1               KUWAIT                    KUWAIT                NONE
## 2               KUWAIT                    KUWAIT                NONE
## 3               KUWAIT                    KUWAIT                NONE
## 4 Indian Sub Continent      INDIAN SUB CONTINENT                NONE
## 5               KUWAIT                    KUWAIT                NONE
## 6               KUWAIT                    KUWAIT                NONE
##   AgeClassification RevenueBand_Num RevenueBand InvoiceBand_Num
## 1         REMAINING               2       25-50               1
## 2   18-25 and 26-33               1    Below 25               1
## 3   18-25 and 26-33               1    Below 25               1
## 4         REMAINING               2       25-50               1
## 5         REMAINING               2       25-50               3
## 6         REMAINING               2       25-50               1
##   InvoiceBand OUTLIEROVERALL Sampling_Num Sampling
## 1    Below 25              N            1       TG
## 2    Below 25              N            1       TG
## 3    Below 25              N            1       TG
## 4    Below 25              N            1       TG
## 5       50-75              N            1       TG
## 6    Below 25              N            1       TG

STEP 2

#Further filter customers in step 1 for Sampling = "TG"

act_TG_Customers <- filter(active_Customers, Sampling=="TG")
DT::datatable(act_TG_Customers)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

#STEP 3

#From step 2, Select the customers 
#whose Avg Transaction Value (Amount per bill) is 
#above KD 25 in the transaction file
act_TG_Customers$AmountPerBill <- act_TG_Customers$TotalAmount/act_TG_Customers$billCount

act_TG_KD25_Customers <- filter(act_TG_Customers, AmountPerBill> 25.0)
head(act_TG_KD25_Customers)
##   customer_Number billCount item_Count MaxTransactionDate
## 1              22         1          2         2017-02-10
## 2              23         1          3         2017-02-08
## 3              25         1          2         2017-02-20
## 4              26         1          2         2017-02-17
## 5              27         1          1         2017-02-08
## 6              29         1          4         2017-02-10
##   MinTransactionDate MaxAmount TotalAmount sale_Amount sale_Percent
## 1         2017-02-10        30          60          60       1.0000
## 2         2017-02-08        40         119          80       0.6722
## 3         2017-02-20        65         114           0       0.0000
## 4         2017-02-17        25          30           5       0.1666
## 5         2017-02-08        35          35          35       1.0000
## 6         2017-02-10        40          85          85       1.0000
##   MemberCardNumber Customer_Group_Num Customer_Group Revenue Invoice
## 1               22                  1         ACTIVE   28.15       1
## 2               23                  1         ACTIVE   11.05       1
## 3               25                  1         ACTIVE    9.80       1
## 4               26                  1         ACTIVE   13.10       1
## 5               27                  1         ACTIVE   16.20       5
## 6               29                  1         ACTIVE   30.30       1
##      Nationality_Group NationalityClassification Stateclassification
## 1               KUWAIT                    KUWAIT                NONE
## 2               KUWAIT                    KUWAIT                NONE
## 3               KUWAIT                    KUWAIT                NONE
## 4 Indian Sub Continent      INDIAN SUB CONTINENT                NONE
## 5               KUWAIT                    KUWAIT                NONE
## 6               KUWAIT                    KUWAIT                NONE
##   AgeClassification RevenueBand_Num RevenueBand InvoiceBand_Num
## 1         REMAINING               2       25-50               1
## 2   18-25 and 26-33               1    Below 25               1
## 3   18-25 and 26-33               1    Below 25               1
## 4         REMAINING               2       25-50               1
## 5         REMAINING               2       25-50               3
## 6         REMAINING               2       25-50               1
##   InvoiceBand OUTLIEROVERALL Sampling_Num Sampling AmountPerBill
## 1    Below 25              N            1       TG            60
## 2    Below 25              N            1       TG           119
## 3    Below 25              N            1       TG           114
## 4    Below 25              N            1       TG            30
## 5       50-75              N            1       TG            35
## 6    Below 25              N            1       TG            85

STEP 4

#From step 3, Identify the customers for targeting 
#using Recency, Frequency and Monetary analysis (google it if not known). 
#Use max transaction date, bill # and Amount respectively for the 3 fields. 
#Select the best 50% customers

todaydate <- Sys.Date()

RFM_act_TG_KD25 <- data.frame(customer_id = act_TG_KD25_Customers$customer_Number, 
                              revenue = act_TG_KD25_Customers$TotalAmount,
                              most_recent_visit = act_TG_KD25_Customers$MaxTransactionDate,
                              number_of_orders = act_TG_KD25_Customers$billCount,
                              recency_days = difftime(todaydate, act_TG_KD25_Customers$MaxTransactionDate)
                              )

head(RFM_act_TG_KD25)
##   customer_id revenue most_recent_visit number_of_orders recency_days
## 1          22      60        2017-02-10                1     763 days
## 2          23     119        2017-02-08                1     765 days
## 3          25     114        2017-02-20                1     753 days
## 4          26      30        2017-02-17                1     756 days
## 5          27      35        2017-02-08                1     765 days
## 6          29      85        2017-02-10                1     763 days
analysis_date <- max(act_TG_KD25_Customers$MaxTransactionDate)

### Using "rfm_table_customer()" function from rfm package

rfm_result <- rfm_table_customer(RFM_act_TG_KD25, customer_id, number_of_orders, recency_days, revenue, analysis_date)

rfm_data1<- data.frame(rfm_result$rfm)
rfm_data1<-arrange(rfm_data1, desc(rfm_data1$rfm_score))
head(rfm_data1)
##   customer_id recency_days transaction_count amount recency_score
## 1         123     755 days                 4 208.00             5
## 2         127     752 days                 4 458.00             5
## 3         212     749 days                 3 749.25             5
## 4         265     766 days                 3 793.00             5
## 5         324     762 days                 5 609.50             5
## 6         328     745 days                 3 280.25             5
##   frequency_score monetary_score rfm_score
## 1               5              5       555
## 2               5              5       555
## 3               5              5       555
## 4               5              5       555
## 5               5              5       555
## 6               5              5       555
############  TOP 50 % Data      #############
rfm_top50percent_data_TG <- rfm_data1[1:(nrow(rfm_data1)/2),]
head(rfm_top50percent_data_TG)
##   customer_id recency_days transaction_count amount recency_score
## 1         123     755 days                 4 208.00             5
## 2         127     752 days                 4 458.00             5
## 3         212     749 days                 3 749.25             5
## 4         265     766 days                 3 793.00             5
## 5         324     762 days                 5 609.50             5
## 6         328     745 days                 3 280.25             5
##   frequency_score monetary_score rfm_score
## 1               5              5       555
## 2               5              5       555
## 3               5              5       555
## 4               5              5       555
## 5               5              5       555
## 6               5              5       555
#######################################

########### VISUALIZATONS FOR RFM ANALYSIS
#The heat map shows the average monetary value for different categories of recency and frequency scores.
rfm_heatmap(rfm_result)

#to generate the distribution of monetary scores for the different combinations of frequency and recency scores.
rfm_bar_chart(rfm_result)

# To examine the relative distribution of monetary value (total revenue generated by each customer)
#recency days (days since the most recent visit for each customer) 
#frequency (transaction count for each customer)
rfm_histograms(rfm_result)
## Warning: attributes are not identical across measure variables;
## they will be dropped

#Visualize the distribution of customers across orders.
rfm_order_dist(rfm_result)

#Recency vs Monetary Value
rfm_rm_plot(rfm_result)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#Frequency vs Monetary Value
rfm_fm_plot(rfm_result)

#Recency vs Frequency
rfm_rf_plot(rfm_result)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#############   RFM Calculation with out using the RFM package ######################
# customer_Number<- act_tg_kd25_cust_agg_tran$customer_Number
# recent_Transanction <- act_tg_kd25_cust_agg_tran$MaxTransactionDate
# bill_Count <- act_tg_kd25_cust_agg_tran$billCount
# total_Amount <- act_tg_kd25_cust_agg_tran$TotalAmount
# 
# rfm <- data.frame(customer_Number, recent_Transanction, bill_Count, total_Amount)
# head(rfm)
# 
# todaydate <- Sys.Date()
# 
# rfm <- mutate(rfm, recency = difftime(todaydate, rfm$recent_Transanction))
# head(rfm)
# rfm <- mutate(rfm, frequency = rfm$bill_Count)
# rfm$bill_Count <- as.numeric(rfm$bill_Count)
# rfm$frequency <- as.numeric(rfm$frequency)
# rfm$recency <- as.character(rfm$recency)
# rfm$recency <- as.numeric(rfm$recency)
# rfm$monetary <- rfm$total_Amount / rfm$bill_Count
# summary(rfm)
# 
# head(rfm)
# 
# df<-rfm
# 
# #start function for assigining score 1- lowest importance, 5- highest importance
# scoring <- function (df,column,r=5){
# 
#   #get the length of rows of df
#   len <- dim(df)[1]
# 
#   score <- rep(0,times=len)
# 
#   # get the quantity of rows per 1/r e.g. 1/5
#   nr <- round(len / r)
#   if (nr > 0){
# 
#     # seperate the rows by r aliquots
#     rStart <-0
#     rEnd <- 0
#     for (i in 1:r){
# 
#       #set the start row number and end row number
#       rStart = rEnd+1
# 
#       #skip one "i" if the rStart is already in the i+1 or i+2 or ...scope.
#       if (rStart> i*nr) next
# 
#       if (i == r){
#         if(rStart<=len ) rEnd <- len else next
#       }else{
#         rEnd <- i*nr
#       }
# 
#       # set the Recency score
#       score[rStart:rEnd]<- r-i+1
# 
#       # make sure the customer who have the same recency have the same score
#       s <- rEnd+1
#       if(i<r & s <= len){
#         for(u in s: len){
#           if(df[rEnd,column]==df[u,column]){
#             score[u]<- r-i+1
#             rEnd <- u
#           }else{
#             break;
#           }
#         }
# 
#       }
# 
#     }
# 
#   }
#   return(score)
# 
# }
# 
# #end of function Scoring
# head(df)
# 
# df<-df %>% arrange(df$recency)
# R_Score <- scoring(df,"recency",r=5)
# df <- cbind(df, R_Score)
# head(df)
# 
# df <- df%>% arrange(desc(df$frequency))
# head(df)
# F_Score <- scoring(df,"frequency",5)
# df <- cbind(df, F_Score)
# head(df)
# 
# 
# df <- df%>% arrange(desc(df$monetary))
# 
# M_Score <- scoring(df,"monetary",5)
# df <- cbind(df, M_Score)
# head(df)
# #order the dataframe by R_Score, F_Score, and M_Score desc
# df <- arrange(df, desc(R_Score), desc(F_Score), desc(M_Score))
# head(df)
# 
# # caculate the total score
# Total_Score <- c(100*df$R_Score + 10*df$F_Score+df$M_Score)
# 
# df <- cbind(df,Total_Score)
# glimpse(df)
# 
# 
# # get best 50% from df
# x<-length(df[,1])/2
# df1<- df[1:x, ]
# glimpse(df1)
# str(df1)
# 
##########################################################################################

STEP 5

#Select RG as a sample from the reference file 
#such that the RG selected represents the selected customers in step 4 
# (use unpaired homoskedastic t-test) such that the ratio of RG: TG is 1:10.

active_rg_KD25_data <- filter(active_Customers, Sampling=="RG" 
                              & (TotalAmount/billCount)>25.0)

head(active_rg_KD25_data)
##   customer_Number billCount item_Count MaxTransactionDate
## 1              73         1          1         2017-02-18
## 2             141         1          1         2016-12-10
## 3             207         1          3         2017-01-02
## 4             213         4         16         2017-02-13
## 5             230         1          3         2016-12-16
## 6             288         1          2         2017-01-21
##   MinTransactionDate MaxAmount TotalAmount sale_Amount sale_Percent
## 1         2017-02-18     35.00       35.00        0.00       0.0000
## 2         2016-12-10     27.75       27.75       27.75       1.0000
## 3         2017-01-02     39.00       57.00       57.00       1.0000
## 4         2016-12-06     87.00      664.25      416.25       0.6266
## 5         2016-12-16     59.00       94.00       35.00       0.3723
## 6         2017-01-21     35.00       50.00       50.00       1.0000
##   MemberCardNumber Customer_Group_Num Customer_Group Revenue Invoice
## 1               73                  1         ACTIVE   57.55       4
## 2              141                  1         ACTIVE   14.20       1
## 3              207                  1         ACTIVE  240.76       6
## 4              213                  1         ACTIVE   17.35       4
## 5              230                  1         ACTIVE   30.70       3
## 6              288                  1         ACTIVE  204.00      12
##   Nationality_Group NationalityClassification Stateclassification
## 1            KUWAIT                    KUWAIT                NONE
## 2            KUWAIT                    KUWAIT                NONE
## 3            KUWAIT                    KUWAIT                NONE
## 4            KUWAIT                    KUWAIT                NONE
## 5            KUWAIT                    KUWAIT                NONE
## 6            KUWAIT                    KUWAIT                NONE
##   AgeClassification RevenueBand_Num RevenueBand InvoiceBand_Num
## 1         REMAINING               3       50-75               3
## 2         REMAINING               2       25-50               1
## 3   18-25 and 26-33               4    Above 75               4
## 4   34-42 and 43-50               2       25-50               3
## 5         REMAINING               2       25-50               2
## 6   18-25 and 26-33               4    Above 75               4
##   InvoiceBand OUTLIEROVERALL Sampling_Num Sampling
## 1       50-75              N            2       RG
## 2    Below 25              N            2       RG
## 3    Above 75              N            2       RG
## 4       50-75              N            2       RG
## 5       25-50              N            2       RG
## 6    Above 75              N            2       RG
todaydate <- Sys.Date()

RFM_act_RG_KD25 <- data.frame(customer_id = active_rg_KD25_data$customer_Number, 
                              revenue = active_rg_KD25_data$TotalAmount,
                              most_recent_visit = active_rg_KD25_data$MaxTransactionDate,
                              number_of_orders = active_rg_KD25_data$billCount,
                              recency_days = difftime(todaydate, active_rg_KD25_data$MaxTransactionDate)
)
DT::datatable(RFM_act_RG_KD25)
analysis_date <- max(active_rg_KD25_data$MaxTransactionDate)

### Using "rfm_table_customer()" function from rfm package

rfm_result_RG <- rfm_table_customer(RFM_act_RG_KD25, customer_id, number_of_orders,recency_days, revenue, analysis_date)


########### VISUALIZATONS FOR RFM ANALYSIS
#The heat map shows the average monetary value for different categories of recency and frequency scores.
rfm_heatmap(rfm_result)

#to generate the distribution of monetary scores for the different combinations of frequency and recency scores.
rfm_bar_chart(rfm_result)

# To examine the relative distribution of monetary value (total revenue generated by each customer)
#recency days (days since the most recent visit for each customer) 
#frequency (transaction count for each customer)
rfm_histograms(rfm_result)
## Warning: attributes are not identical across measure variables;
## they will be dropped

#Visualize the distribution of customers across orders.
rfm_order_dist(rfm_result)

#Recency vs Monetary Value
rfm_rm_plot(rfm_result)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#Frequency vs Monetary Value
rfm_fm_plot(rfm_result)

#Recency vs Frequency
rfm_rf_plot(rfm_result)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

############
rfm_data2<- data.frame(rfm_result_RG$rfm)
head(rfm_data2)
##   customer_id recency_days transaction_count amount recency_score
## 1          73     755 days                 1  35.00             5
## 2         141     825 days                 1  27.75             1
## 3         207     802 days                 1  57.00             3
## 4         213     760 days                 4 664.25             5
## 5         230     819 days                 1  94.00             2
## 6         288     783 days                 1  50.00             4
##   frequency_score monetary_score rfm_score
## 1               1              1       511
## 2               1              1       111
## 3               1              2       312
## 4               5              5       555
## 5               1              3       213
## 6               1              2       412
rfm_data2<-arrange(rfm_data2, desc(rfm_data2$rfm_score))
head(rfm_data2)
##   customer_id recency_days transaction_count  amount recency_score
## 1         213     760 days                 4  664.25             5
## 2         434     749 days                 3 1578.50             5
## 3         692     756 days                 3  220.50             5
## 4         840     763 days                 6  231.25             5
## 5        2001     759 days                 5  637.00             5
## 6        3136     745 days                 3  281.00             5
##   frequency_score monetary_score rfm_score
## 1               5              5       555
## 2               5              5       555
## 3               5              5       555
## 4               5              5       555
## 5               5              5       555
## 6               5              5       555
rfm_top50percent_data_RG <- rfm_data2[1:(nrow(rfm_data2)/2),]

head(rfm_top50percent_data_RG)
##   customer_id recency_days transaction_count  amount recency_score
## 1         213     760 days                 4  664.25             5
## 2         434     749 days                 3 1578.50             5
## 3         692     756 days                 3  220.50             5
## 4         840     763 days                 6  231.25             5
## 5        2001     759 days                 5  637.00             5
## 6        3136     745 days                 3  281.00             5
##   frequency_score monetary_score rfm_score
## 1               5              5       555
## 2               5              5       555
## 3               5              5       555
## 4               5              5       555
## 5               5              5       555
## 6               5              5       555
head(rfm_top50percent_data_TG)
##   customer_id recency_days transaction_count amount recency_score
## 1         123     755 days                 4 208.00             5
## 2         127     752 days                 4 458.00             5
## 3         212     749 days                 3 749.25             5
## 4         265     766 days                 3 793.00             5
## 5         324     762 days                 5 609.50             5
## 6         328     745 days                 3 280.25             5
##   frequency_score monetary_score rfm_score
## 1               5              5       555
## 2               5              5       555
## 3               5              5       555
## 4               5              5       555
## 5               5              5       555
## 6               5              5       555
nrow(rfm_top50percent_data_RG)
## [1] 1118
nrow(rfm_top50percent_data_TG)
## [1] 12745
rg_data <- orignalData %>% select(customer_Number, RevenueBand,TotalAmount,Revenue) %>% 
                            filter(customer_Number  %in% rfm_top50percent_data_RG$customer_id)

tg_data <- orignalData %>% select(customer_Number, RevenueBand,TotalAmount, Revenue) %>%
                          filter( customer_Number %in% rfm_top50percent_data_TG$customer_id)

summary(rg_data)
##  customer_Number RevenueBand         TotalAmount         Revenue       
##  Min.   :   73   Length:1118        Min.   :  25.50   Min.   :   0.50  
##  1st Qu.:12181   Class :character   1st Qu.:  66.81   1st Qu.:  18.61  
##  Median :25230   Mode  :character   Median : 122.75   Median :  48.02  
##  Mean   :24553                      Mean   : 168.21   Mean   :  91.05  
##  3rd Qu.:36304                      3rd Qu.: 212.00   3rd Qu.: 110.28  
##  Max.   :49809                      Max.   :1578.50   Max.   :1969.96
summary(tg_data)
##  customer_Number RevenueBand         TotalAmount        Revenue       
##  Min.   :   22   Length:12745       Min.   :  25.5   Min.   :   0.20  
##  1st Qu.:12642   Class :character   1st Qu.:  62.0   1st Qu.:  18.98  
##  Median :25294   Mode  :character   Median : 116.8   Median :  50.20  
##  Mean   :24923                      Mean   : 161.6   Mean   :  89.77  
##  3rd Qu.:37146                      3rd Qu.: 202.0   3rd Qu.: 114.30  
##  Max.   :49807                      Max.   :2360.5   Max.   :1937.00
table(rg_data$RevenueBand)
## 
##    25-50    50-75 Above 75 Below 25 
##      245      295      366      212
table(tg_data$RevenueBand)
## 
##    25-50    50-75 Above 75 Below 25 
##     2827     3520     4176     2222
rg_25to50  <-  filter(rg_data, rg_data$RevenueBand == "25-50")
rg_50to75  <-  filter(rg_data, rg_data$RevenueBand == "50-75")
rg_Above75 <- filter(rg_data, rg_data$RevenueBand == "Above 75")
rg_Below25 <- filter(rg_data, rg_data$RevenueBand == "Below 25")

tg_25to50  <-  filter(tg_data, tg_data$RevenueBand == "25-50")
tg_50to75  <-  filter(tg_data, tg_data$RevenueBand == "50-75")
tg_Above75 <- filter(tg_data, tg_data$RevenueBand == "Above 75")
tg_Below25 <- filter(tg_data, tg_data$RevenueBand == "Below 25")

set.seed(420)
tg_25to50_sample = sample_n(tg_25to50, 2450)

set.seed(420)
tg_50to75_sample = sample_n(tg_50to75, 2950)

set.seed(420)
tg_Above75_sample = sample_n(tg_Above75, 3660)

set.seed(420)
tg_Below25_sample = sample_n(tg_Below25, 2120)

tg_data_sample <- rbind(tg_25to50_sample,tg_50to75_sample,tg_Above75_sample,tg_Below25_sample)

t-test

############################## t- test #############################


t.test(tg_data_sample$TotalAmount,rg_data$TotalAmount)
## 
##  Welch Two Sample t-test
## 
## data:  tg_data_sample$TotalAmount and rg_data$TotalAmount
## t = -1.3659, df = 1339.9, p-value = 0.1722
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -16.444178   2.944321
## sample estimates:
## mean of x mean of y 
##  161.4574  168.2073

STEP 7

################################    Step 7    #######################
rg<- table(rg_data$RevenueBand)
tg<-table(tg_data$RevenueBand)
rg<-transform(rg)
tg<-transform(tg)
rg_tg<-cbind(rg,tg)

names(rg_tg)[1]<-"Revenue_Band"
names(rg_tg)[2]<- "RG_total_customers"
names(rg_tg)[4]<- "TG_total_customers"
rg_tg <- rg_tg[,-3]


rg_tg$TG_AvgAmountPerMember <- c(sum(tg_25to50$Revenue)/nrow(tg_25to50),
sum(tg_50to75$Revenue)/nrow(tg_50to75),
sum(tg_Above75$Revenue)/nrow(tg_Above75),
sum(tg_Below25$Revenue)/nrow(tg_Below25))

rg_tg$RG_AvgAmountPerMember <- c(sum(rg_25to50$Revenue)/nrow(rg_25to50),
                               sum(rg_50to75$Revenue)/nrow(rg_50to75),
                               sum(rg_Above75$Revenue)/nrow(rg_Above75),
                               sum(rg_Below25$Revenue)/nrow(rg_Below25))

DT::datatable(rg_tg)